import warnings
import gc
import os.path
import pandas as pd
import numpy as np
from numpy import argmax
import pandas_flavor as pf
import re
from time import time
import tqdm
import pickle
from collections import Counter
from sklearn.metrics import f1_score, fbeta_score, make_scorer
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default = 'notebook'
# Silence ALL warnings for notebook readability (also hides genuine deprecation warnings).
warnings.filterwarnings("ignore")
# Load the pre-processed training dataset produced by the feature-engineering step.
data = pd.read_pickle('../../gen_data/data_to_train.pkl')
# Keep only [A-Za-z0-9_] in column names — presumably to satisfy
# LightGBM/XGBoost feature-name restrictions; confirm against the training step.
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
Ajout d'une ligne à un dataframe
@pf.register_dataframe_method
def add_row(df, row):
    """Append ``row`` in place as the last row of ``df``.

    Registered through pandas_flavor so it can be called as a
    DataFrame method: ``df.add_row([...])``. Mutates ``df``; returns None.
    """
    next_index = len(df)
    df.loc[next_index] = row
Fonction pour extraire un sample équilibré des données
def get_sample_for_testing(data, ratio):
    """Return a stratified random sample of ``data``.

    Draws the same fraction ``ratio`` from each TARGET class, so the
    original class imbalance is preserved in the sample.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a binary ``TARGET`` column (0/1).
    ratio : float
        Fraction of each class to keep (0 < ratio <= 1).

    Returns
    -------
    pd.DataFrame
        Sampled rows (class-1 rows first, then class-0 rows).
    """
    data_0 = data[data.TARGET == 0]
    data_1 = data[data.TARGET == 1]
    data_0 = data_0.sample(int(round(len(data_0)*ratio, 0)))
    data_1 = data_1.sample(int(round(len(data_1)*ratio, 0)))
    # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat produces the same result (class-1 block on top).
    data = pd.concat([data_1, data_0])
    # Free the intermediate frames eagerly — the full dataset is large.
    del data_0, data_1
    gc.collect()
    return data
On définit tout d'abord une fonction de gain métier, où les faux négatifs sont plus pénalisants que ne sont avantageux les vrais négatifs, en utilisant des coefficients multiplicatifs. Les coefficients sont choisis ici de façon arbitraire mais peuvent être définis par le métier de façon plus précise.
def compute_gain(y_true, y_pred):
    """Business gain: each true negative earns 2, each false negative costs 10.

    The coefficients are arbitrary placeholders meant to be refined by the
    business (a missed default is far more costly than a granted good loan).

    Fix: the original ``confusion_matrix(...).ravel()`` unpacking crashed
    whenever only a single class was present (a 1x1 matrix cannot be
    unpacked into four values). Counting tn/fn directly with numpy is
    equivalent for binary labels and robust to that edge case.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1 labels (lists, arrays, Series or
        single-column DataFrames).

    Returns
    -------
    int
        The (possibly negative) business gain.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    tn = int(np.sum((y_true == 0) & (y_pred == 0)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))
    gain = 2* tn - 10*fn
    return gain
On calcule un score entre 0 et 1, qui correspond à la proportion du gain maximal que représente le gain obtenu.
def compute_gain_score(y_true, y_pred):
    """Normalised business gain.

    Expresses the achieved gain as a fraction of the maximum gain a
    perfect prediction (``y_pred == y_true``) would obtain, so the value
    lies between 0 and 1 for reasonable predictions.
    """
    best_possible = compute_gain(y_true, y_true)
    return compute_gain(y_true, y_pred) / best_possible
# Create the (initially empty) results table.
def init_table():
    """Return an empty DataFrame with one column per logged metric."""
    metric_columns = ["model name", "step", "balance", "time",
                      "auc", "f1_score", "f2_score", "accuracy",
                      "precision", "recall", "gain"]
    return pd.DataFrame([], columns=metric_columns)
# Compute every score for a fitted pipeline and append them to the results table.
def evaluate_and_log(model_name, model_pipeline, step, balance, time, X, y, scores_df):
    """Score ``model_pipeline`` on (X, y) and append one row to ``scores_df``.

    ``step`` tags the data split ('train'/'test'), ``balance`` the
    rebalancing strategy, and ``time`` the measured fit duration; all three
    are logged as-is. Returns the updated ``scores_df``.
    """
    predictions = model_pipeline.predict(X)
    probabilities = model_pipeline.predict_proba(X)
    # AUC needs the positive-class probability; the other metrics use labels.
    row = [
        model_name,
        step,
        balance,
        time,
        roc_auc_score(y, probabilities[:, 1]),
        f1_score(y, predictions),
        fbeta_score(y, predictions, beta=2),
        accuracy_score(y, predictions),
        precision_score(y, predictions),
        recall_score(y, predictions),
        compute_gain_score(y, predictions),
    ]
    scores_df.add_row(row)
    return scores_df
Fonction d'affichage graphique de la matrice de confusion
def plot_confusion_matrix(y_test, test_pred):
    """Display the binary confusion matrix as an annotated plotly heatmap."""
    matrix = confusion_matrix(y_test, test_pred)
    class_labels = ['0', '1']
    fig = px.imshow(
        matrix,
        text_auto=True,
        labels=dict(x="Prédiction", y="Données réelles"),
        x=class_labels,
        y=class_labels,
    )
    # Predictions on top, like the classic confusion-matrix layout.
    fig.update_xaxes(side="top")
    fig.update_layout(height=400, width=400)
    fig.show()
Calcul et affichage des scores suivant un certain seuil de probabilité
# Assign the class label according to the probability threshold.
def to_labels(pos_probs, threshold):
    """Binarise probabilities: 1 where ``pos_probs >= threshold``, else 0."""
    above_threshold = pos_probs >= threshold
    return above_threshold.astype('int')
# Print all scores for a given probability threshold.
def evaluate_model(model_pipeline, X, y, threshold):
    """Threshold the pipeline's positive-class probabilities at ``threshold``,
    print every evaluation score and display the confusion matrix."""
    # prediction
    proba = model_pipeline.predict_proba(X)
    # Column 1 of predict_proba holds the positive-class probability.
    y_pred = to_labels(proba, threshold)[::, 1]
    print('Confusion matrix:\n', confusion_matrix(y, y_pred))
    metric_lines = [
        ('auc : {:.4f}', roc_auc_score(y, proba[:, 1])),
        ('f1_score :{:.4f}', f1_score(y, y_pred)),
        ('f2_score :{:.4f}', fbeta_score(y, y_pred, beta=2)),
        ('accuracy :{:.4f}', accuracy_score(y, y_pred)),
        ('precision :{:.4f}', precision_score(y, y_pred)),
        ('recall : {:.4f}', recall_score(y, y_pred)),
        ('gain : {:.4f}', compute_gain_score(y, y_pred)),
    ]
    for template, value in metric_lines:
        print(template.format(value))
    plot_confusion_matrix(y, y_pred)
On extrait la moitié des données uniquement pour accélérer la pré-analyse.
# Keep half the rows (class proportions preserved) to speed up the pre-analysis.
data_sample = get_sample_for_testing(data, 0.5)
# Target kept as a single-column DataFrame; drop the row id and target from the features.
y_sample = data_sample[['TARGET']]
X_sample = data_sample.drop(columns=['SK_ID_CURR','TARGET'])
features = X_sample.columns
Initialisation du dataframe des scores
# Initialise the shared scores table.
# Fix: the original `global scores_df` statement was removed — `global`
# has no effect at module scope, so it was a misleading no-op.
scores_df = init_table()
Fonction permettant de comparer plusieurs modèles afin de choisir le meilleur à optimiser. On peut positionner des choix de stratégies de ré-équilibrage des données (oversampling ou paramètre)
def train_and_evaluate(X, y, scores_df, with_params = False, with_smote=False):
    """Cross-validate several baseline classifiers and log their scores.

    Runs a 5-fold stratified CV of five models (dummy, logistic regression,
    random forest, XGBoost, LightGBM) inside a RobustScaler pipeline, and
    logs train/test scores for each fold into ``scores_df``.

    Parameters
    ----------
    X, y : pd.DataFrame
        Features and single-column ('TARGET') label frame.
    scores_df : pd.DataFrame
        Results table (see ``init_table``); the updated table is returned.
    with_params : bool
        Rebalance classes via each estimator's weighting parameter.
    with_smote : bool
        Rebalance the training folds via SMOTE oversampling
        (ignored for classifier choice when ``with_params`` is True).
    """
    counts = Counter(y['TARGET'])
    scale_pos_weight = counts[0] / counts[1]
    balance = 'unbalanced'
    if with_params:
        balance = 'param'
        # Rebalancing through each estimator's own class-weighting parameter.
        classifiers = [
            ('Dummy classifier', DummyClassifier(strategy="stratified")),
            ('Logistic Regression', LogisticRegression(class_weight='balanced')),
            ('RandomForest', RandomForestClassifier(class_weight='balanced')),
            ('XGBoost', XGBClassifier(scale_pos_weight=scale_pos_weight)),
            ('Light GBM', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight)),
        ]
    else:
        if with_smote:
            balance = 'oversampling'
        # Fix: the original `with_smote` and plain branches built
        # byte-identical classifier lists — deduplicated into one.
        classifiers = [
            ('Dummy classifier', DummyClassifier()),
            ('Logistic Regression', LogisticRegression()),
            ('RandomForest', RandomForestClassifier()),
            ('XGBoost', XGBClassifier()),
            ('Light GBM', LGBMClassifier(objective='binary')),
        ]
    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
    for clf_name, clf in tqdm.tqdm(classifiers):
        print(clf_name)
        print('===============================')
        # Scale then classify; scaling is refit on each training fold.
        pipeline = Pipeline(steps=[
            ('scaler', RobustScaler()),
            ('classifier', clf)
        ]
        )
        for i, (train_index, test_index) in enumerate(skfolds.split(X, y)):
            start = time()
            X_train = X.iloc[train_index]
            y_train = y.iloc[train_index]
            X_test = X.iloc[test_index]
            y_test = y.iloc[test_index]
            if with_smote:
                over_only = SMOTE()
                print('Sampling')
                # Oversample the minority class in the training fold only,
                # so the test fold keeps the real class distribution.
                X_train_re, y_train_re = over_only.fit_resample(
                    X_train, y_train)
                curr_clf = pipeline.fit(X_train_re, y_train_re)
            else:
                curr_clf = pipeline.fit(X_train, y_train)
            duration = time()-start
            print(clf_name + ' -- fold n°' + str(i))
            print('-------------------------------')
            # Log the fold's scores on the (un-resampled) train and test splits.
            scores_df = evaluate_and_log(
                clf_name, curr_clf, 'train',balance, duration, X_train, y_train, scores_df)
            scores_df = evaluate_and_log(
                clf_name, curr_clf, 'test', balance, duration, X_test, y_test, scores_df)
        print('===============================')
    return scores_df
scores_df = train_and_evaluate(X_sample, y_sample, scores_df)
0%| | 0/5 [00:00<?, ?it/s]
Dummy classifier =============================== Dummy classifier -- fold n°0 ------------------------------- Dummy classifier -- fold n°1 ------------------------------- Dummy classifier -- fold n°2 ------------------------------- Dummy classifier -- fold n°3 ------------------------------- Dummy classifier -- fold n°4 -------------------------------
20%|██ | 1/5 [00:10<00:40, 10.01s/it]
=============================== Logistic Regression =============================== Logistic Regression -- fold n°0 ------------------------------- Logistic Regression -- fold n°1 ------------------------------- Logistic Regression -- fold n°2 ------------------------------- Logistic Regression -- fold n°3 ------------------------------- Logistic Regression -- fold n°4 -------------------------------
40%|████ | 2/5 [00:34<00:56, 18.69s/it]
=============================== RandomForest =============================== RandomForest -- fold n°0 ------------------------------- RandomForest -- fold n°1 ------------------------------- RandomForest -- fold n°2 ------------------------------- RandomForest -- fold n°3 ------------------------------- RandomForest -- fold n°4 -------------------------------
60%|██████ | 3/5 [08:58<08:00, 240.21s/it]
=============================== XGBoost =============================== XGBoost -- fold n°0 ------------------------------- XGBoost -- fold n°1 ------------------------------- XGBoost -- fold n°2 ------------------------------- XGBoost -- fold n°3 ------------------------------- XGBoost -- fold n°4 -------------------------------
80%|████████ | 4/5 [11:00<03:13, 193.56s/it]
=============================== Light GBM =============================== Light GBM -- fold n°0 ------------------------------- Light GBM -- fold n°1 ------------------------------- Light GBM -- fold n°2 ------------------------------- Light GBM -- fold n°3 ------------------------------- Light GBM -- fold n°4 -------------------------------
100%|██████████| 5/5 [11:29<00:00, 137.91s/it]
===============================
scores_df = train_and_evaluate(X_sample, y_sample, with_smote=True, scores_df = scores_df)
0%| | 0/5 [00:00<?, ?it/s]
Dummy classifier =============================== Sampling Dummy classifier -- fold n°0 ------------------------------- Sampling Dummy classifier -- fold n°1 ------------------------------- Sampling Dummy classifier -- fold n°2 ------------------------------- Sampling Dummy classifier -- fold n°3 ------------------------------- Sampling Dummy classifier -- fold n°4 -------------------------------
20%|██ | 1/5 [00:21<01:26, 21.75s/it]
=============================== Logistic Regression =============================== Sampling Logistic Regression -- fold n°0 ------------------------------- Sampling Logistic Regression -- fold n°1 ------------------------------- Sampling Logistic Regression -- fold n°2 ------------------------------- Sampling Logistic Regression -- fold n°3 ------------------------------- Sampling Logistic Regression -- fold n°4 -------------------------------
40%|████ | 2/5 [01:09<01:51, 37.05s/it]
=============================== RandomForest =============================== Sampling RandomForest -- fold n°0 ------------------------------- Sampling RandomForest -- fold n°1 ------------------------------- Sampling RandomForest -- fold n°2 ------------------------------- Sampling RandomForest -- fold n°3 ------------------------------- Sampling RandomForest -- fold n°4 -------------------------------
60%|██████ | 3/5 [15:59<14:12, 426.38s/it]
=============================== XGBoost =============================== Sampling XGBoost -- fold n°0 ------------------------------- Sampling XGBoost -- fold n°1 ------------------------------- Sampling XGBoost -- fold n°2 ------------------------------- Sampling XGBoost -- fold n°3 ------------------------------- Sampling XGBoost -- fold n°4 -------------------------------
80%|████████ | 4/5 [20:30<06:05, 365.21s/it]
=============================== Light GBM =============================== Sampling Light GBM -- fold n°0 ------------------------------- Sampling Light GBM -- fold n°1 ------------------------------- Sampling Light GBM -- fold n°2 ------------------------------- Sampling Light GBM -- fold n°3 ------------------------------- Sampling Light GBM -- fold n°4 -------------------------------
100%|██████████| 5/5 [21:30<00:00, 258.07s/it]
===============================
scores_df = train_and_evaluate(X_sample, y_sample, with_params=True, scores_df = scores_df)
0%| | 0/5 [00:00<?, ?it/s]
Dummy classifier =============================== Dummy classifier -- fold n°0 ------------------------------- Dummy classifier -- fold n°1 ------------------------------- Dummy classifier -- fold n°2 ------------------------------- Dummy classifier -- fold n°3 ------------------------------- Dummy classifier -- fold n°4 -------------------------------
20%|██ | 1/5 [00:10<00:41, 10.30s/it]
=============================== Logistic Regression =============================== Logistic Regression -- fold n°0 ------------------------------- Logistic Regression -- fold n°1 ------------------------------- Logistic Regression -- fold n°2 ------------------------------- Logistic Regression -- fold n°3 ------------------------------- Logistic Regression -- fold n°4 -------------------------------
40%|████ | 2/5 [00:33<00:54, 18.14s/it]
=============================== RandomForest =============================== RandomForest -- fold n°0 ------------------------------- RandomForest -- fold n°1 ------------------------------- RandomForest -- fold n°2 ------------------------------- RandomForest -- fold n°3 ------------------------------- RandomForest -- fold n°4 -------------------------------
60%|██████ | 3/5 [07:24<06:34, 197.21s/it]
=============================== XGBoost =============================== XGBoost -- fold n°0 ------------------------------- XGBoost -- fold n°1 ------------------------------- XGBoost -- fold n°2 ------------------------------- XGBoost -- fold n°3 ------------------------------- XGBoost -- fold n°4 -------------------------------
80%|████████ | 4/5 [09:11<02:41, 161.83s/it]
=============================== Light GBM =============================== Light GBM -- fold n°0 ------------------------------- Light GBM -- fold n°1 ------------------------------- Light GBM -- fold n°2 ------------------------------- Light GBM -- fold n°3 ------------------------------- Light GBM -- fold n°4 -------------------------------
100%|██████████| 5/5 [09:40<00:00, 116.15s/it]
===============================
# Persist the per-fold scores, then display the table.
scores_df.to_pickle('../../gen_data/partial_data_scores_df.pkl')
scores_df
| model name | step | balance | time | auc | f1_score | f2_score | accuracy | precision | recall | gain | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Dummy classifier | train | unbalanced | 0.739854 | 0.500000 | 0.000000 | 0.000000 | 0.919279 | 0.000000 | 0.000000 | 0.560955 |
| 1 | Dummy classifier | test | unbalanced | 0.739854 | 0.500000 | 0.000000 | 0.000000 | 0.919255 | 0.000000 | 0.000000 | 0.560811 |
| 2 | Dummy classifier | train | unbalanced | 0.708848 | 0.500000 | 0.000000 | 0.000000 | 0.919279 | 0.000000 | 0.000000 | 0.560955 |
| 3 | Dummy classifier | test | unbalanced | 0.708848 | 0.500000 | 0.000000 | 0.000000 | 0.919255 | 0.000000 | 0.000000 | 0.560811 |
| 4 | Dummy classifier | train | unbalanced | 0.726699 | 0.500000 | 0.000000 | 0.000000 | 0.919271 | 0.000000 | 0.000000 | 0.560907 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 145 | Light GBM | test | param | 3.411453 | 0.759413 | 0.280415 | 0.430673 | 0.722448 | 0.177311 | 0.670024 | 0.582193 |
| 146 | Light GBM | train | param | 3.671558 | 0.841317 | 0.323627 | 0.499132 | 0.736196 | 0.204048 | 0.781772 | 0.636371 |
| 147 | Light GBM | test | param | 3.671558 | 0.771880 | 0.281681 | 0.435140 | 0.718708 | 0.177406 | 0.683320 | 0.582794 |
| 148 | Light GBM | train | param | 3.354270 | 0.843957 | 0.328400 | 0.505394 | 0.739537 | 0.207365 | 0.788822 | 0.642482 |
| 149 | Light GBM | test | param | 3.354270 | 0.753442 | 0.272394 | 0.420097 | 0.716302 | 0.171750 | 0.657937 | 0.571262 |
150 rows × 11 columns
# Aggregate the per-fold scores: total fit time, mean of every metric,
# per (model, rebalancing strategy, train/test split).
# Fix: pass 'sum' as a string — passing the builtin `sum` triggers a
# pandas FutureWarning and is internally mapped to 'sum' anyway.
mean_scores_df = scores_df.groupby(by=['model name', 'balance','step']).agg({'time': 'sum', 'auc': 'mean',
                                                            'accuracy': 'mean', 'f1_score': 'mean', 'f2_score':'mean',
                                                            'precision': 'mean', 'recall': 'mean',
                                                            'gain': 'mean'})
# Persist and display the aggregated table.
mean_scores_df.to_pickle('../../gen_data/mean_scores.pkl')
mean_scores_df
| time | auc | accuracy | f1_score | f2_score | precision | recall | gain | |||
|---|---|---|---|---|---|---|---|---|---|---|
| model name | balance | step | ||||||||
| Dummy classifier | oversampling | test | 15.535323 | 0.500000 | 0.919274 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.560926 |
| train | 15.535323 | 0.500000 | 0.919274 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.560926 | ||
| param | test | 3.726697 | 0.500318 | 0.851686 | 0.081946 | 0.081987 | 0.081886 | 0.082018 | 0.516212 | |
| train | 3.726697 | 0.500027 | 0.851384 | 0.080663 | 0.080726 | 0.080562 | 0.080769 | 0.515445 | ||
| unbalanced | test | 3.612703 | 0.500000 | 0.919274 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.560926 | |
| train | 3.612703 | 0.500000 | 0.919274 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.560926 | ||
| Light GBM | oversampling | test | 48.450429 | 0.751584 | 0.918884 | 0.049808 | 0.032462 | 0.457271 | 0.026346 | 0.569756 |
| train | 48.450429 | 0.794139 | 0.920141 | 0.063529 | 0.041362 | 0.595288 | 0.033556 | 0.573656 | ||
| param | test | 18.050525 | 0.759759 | 0.718539 | 0.276419 | 0.425886 | 0.174406 | 0.665969 | 0.576491 | |
| train | 18.050525 | 0.842636 | 0.737737 | 0.326420 | 0.503111 | 0.205900 | 0.787182 | 0.639952 | ||
| unbalanced | test | 18.185600 | 0.758627 | 0.919274 | 0.047278 | 0.030637 | 0.500443 | 0.024815 | 0.569643 | |
| train | 18.185600 | 0.844489 | 0.922354 | 0.091838 | 0.059915 | 0.823075 | 0.048642 | 0.581362 | ||
| Logistic Regression | oversampling | test | 41.010011 | 0.745781 | 0.688023 | 0.261094 | 0.414782 | 0.161417 | 0.682727 | 0.549182 |
| train | 41.010011 | 0.747601 | 0.688747 | 0.261570 | 0.415268 | 0.161780 | 0.682767 | 0.549983 | ||
| param | test | 16.703306 | 0.752670 | 0.691470 | 0.265456 | 0.420936 | 0.164308 | 0.690624 | 0.555705 | |
| train | 16.703306 | 0.755012 | 0.691587 | 0.266128 | 0.422084 | 0.164701 | 0.692717 | 0.556568 | ||
| unbalanced | test | 18.165263 | 0.752018 | 0.919469 | 0.029771 | 0.019000 | 0.541052 | 0.015307 | 0.566516 | |
| train | 18.165263 | 0.754223 | 0.919471 | 0.029924 | 0.019099 | 0.542283 | 0.015388 | 0.566546 | ||
| RandomForest | oversampling | test | 827.213363 | 0.710726 | 0.918585 | 0.023098 | 0.014785 | 0.368752 | 0.011924 | 0.564365 |
| train | 827.213363 | 1.000000 | 0.999998 | 0.999990 | 0.999984 | 1.000000 | 0.999980 | 0.999991 | ||
| param | test | 354.539444 | 0.723775 | 0.919222 | 0.002569 | 0.001610 | 0.450476 | 0.001289 | 0.561322 | |
| train | 354.539444 | 1.000000 | 0.999959 | 0.999748 | 0.999597 | 1.000000 | 0.999496 | 0.999779 | ||
| unbalanced | test | 440.370899 | 0.717079 | 0.919398 | 0.005296 | 0.003320 | 0.677121 | 0.002659 | 0.561995 | |
| train | 440.370899 | 1.000000 | 0.999964 | 0.999778 | 0.999645 | 1.000000 | 0.999557 | 0.999805 | ||
| XGBoost | oversampling | test | 259.086710 | 0.740912 | 0.917134 | 0.080395 | 0.054509 | 0.386069 | 0.044876 | 0.574362 |
| train | 259.086710 | 0.885527 | 0.929619 | 0.250665 | 0.175124 | 0.891882 | 0.145827 | 0.623402 | ||
| param | test | 95.893754 | 0.733688 | 0.769523 | 0.275528 | 0.391096 | 0.184615 | 0.542943 | 0.588738 | |
| train | 95.893754 | 0.923690 | 0.820230 | 0.442003 | 0.630768 | 0.294915 | 0.881868 | 0.762949 | ||
| unbalanced | test | 109.247933 | 0.745733 | 0.917954 | 0.084207 | 0.056850 | 0.426796 | 0.046729 | 0.575904 | |
| train | 109.247933 | 0.914073 | 0.932457 | 0.296215 | 0.210192 | 0.932298 | 0.176100 | 0.637124 |
# Flatten the (model name, balance, step) MultiIndex into columns for plotting.
mean_scores_df = mean_scores_df.reset_index()
# One grouped bar chart per metric: x = rebalancing strategy, colour = model.
# Training time is shown for the train split; all other metrics for the test split.
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'train'], x="balance", y="time", color='model name', barmode='group')
fig.update_layout(title="Temps d'entraînement des modèles")
fig.show('notebook')
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'test'], x="balance", y="auc", color='model name', barmode='group')
fig.update_layout(title = 'AUC')
fig.show('notebook')
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'test'], x="balance", y="precision", color='model name', barmode='group')
fig.update_layout(title='Precision')
fig.show('notebook')
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'test'], x="balance", y="recall", color='model name', barmode='group')
fig.update_layout(title='Recall')
fig.show('notebook')
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'test'], x="balance", y="f1_score", color='model name', barmode='group')
fig.update_layout(title='F1_score')
fig.show('notebook')
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'test'], x="balance", y="f2_score", color='model name', barmode='group')
fig.update_layout(title='F2_score')
fig.show('notebook')
# The business-gain metric, which ultimately drives model selection.
fig = px.bar(mean_scores_df[mean_scores_df['step'] == 'test'], x="balance", y="gain", color='model name', barmode='group')
fig.update_layout(title='Gain')
fig.show()
# Switch to the FULL dataset for the final model tuning.
y = data[['TARGET']]
X = data.drop(columns=['SK_ID_CURR','TARGET'])
features = X.columns
# Stratified hold-out split: 20% kept aside for final validation.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=100, stratify = y)
class CustomGainScore(object):
    """Callable scorer for GridSearchCV: higher business gain is better.

    Matches the (estimator, X, y) scorer signature expected by sklearn.
    """

    def __call__(self, model, X, Y):
        """Return the normalised gain of ``model``'s predictions on (X, Y)."""
        predicted = model.predict(X)
        return compute_gain_score(Y, predicted)
def lgbm_cross_val(X_train, y_train):
    """Grid-search LightGBM regularisation with 5-fold stratified CV.

    Model selection is driven by the custom business-gain scorer; all
    hyper-parameters other than the two regularisation strengths are fixed.
    Returns the best pipeline refitted on the whole training set.
    """
    class_counts = Counter(y_train['TARGET'])
    scale_pos_weight = class_counts[0] / class_counts[1]
    print("Starting... Train shape: {}".format(X_train.shape))
    print("Target counting: {}".format(Counter(y_train['TARGET'])))
    # Only the L1/L2 regularisation strengths are searched.
    search_params = {
        'lgbm__reg_alpha': [0, 0.01, 0.02],
        'lgbm__reg_lambda': [0, 0.1, 0.2]
    }
    # Everything else is pinned to a single value.
    fixed_params = {
        'lgbm__learning_rate': [0.2],
        'lgbm__num_leaves': [31],
        'lgbm__n_estimators': [500],
        'lgbm__subsample': [0.5],
        'lgbm__colsample_bytree': [0.5],
        'lgbm__min_child_samples': [100]
    }
    param_grid = dict(search_params, **fixed_params)
    # Scale then classify; scale_pos_weight handles the class imbalance.
    classifier_pipe = Pipeline(steps=[
        ('scaler', RobustScaler()),
        ('lgbm', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight))
    ])
    kfolds = StratifiedKFold(5, shuffle=True, random_state=42)
    # Grid search cross-validation scored by the business gain.
    grid_cv = GridSearchCV(
        classifier_pipe,
        param_grid,
        scoring=CustomGainScore(),
        cv=kfolds,
        n_jobs=1,
        return_train_score=True,
        verbose=10,
    )
    grid_cv.fit(X_train, y_train)
    print(f"BEST SCORE: {grid_cv.best_score_}")
    best_model = grid_cv.best_estimator_
    print(grid_cv.best_params_)
    return best_model
# Load the final model from disk if it was already trained; otherwise run the
# cross-validated grid search and persist the winner. Context managers ensure
# the file handles are closed (the original leaked them via bare open()), and
# the duplicate filename assignment is removed.
filename = '../../gen_data/final_model.sav'
if os.path.exists(filename):
    with open(filename, 'rb') as model_file:
        best_model = pickle.load(model_file)
else:
    best_model = lgbm_cross_val(X_train, y_train)
    # Save the model for later reuse.
    with open(filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
Starting... Train shape: (246008, 100)
Target counting: Counter({0.0: 226148, 1.0: 19860})
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 1/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.834, test=0.606) total time= 15.5s
[CV 2/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 2/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.833, test=0.609) total time= 17.6s
[CV 3/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 3/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.831, test=0.600) total time= 13.3s
[CV 4/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 4/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.834, test=0.614) total time= 17.7s
[CV 5/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 5/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.833, test=0.608) total time= 13.0s
[CV 1/5; 2/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 1/5; 2/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.836, test=0.604) total time= 17.3s
[CV 2/5; 2/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 2/5; 2/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.833, test=0.612) total time= 17.6s
[CV 3/5; 2/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 3/5; 2/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.836, test=0.604) total time= 13.7s
[CV 4/5; 2/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 4/5; 2/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.834, test=0.607) total time= 13.2s
[CV 5/5; 2/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 5/5; 2/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.834, test=0.606) total time= 13.5s
[CV 1/5; 3/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 1/5; 3/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.835, test=0.601) total time= 14.0s
[CV 2/5; 3/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 2/5; 3/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.837, test=0.608) total time= 13.9s
[CV 3/5; 3/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 3/5; 3/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.836, test=0.605) total time= 13.7s
[CV 4/5; 3/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 4/5; 3/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.832, test=0.608) total time= 13.6s
[CV 5/5; 3/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 5/5; 3/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.832, test=0.604) total time= 13.6s
[CV 1/5; 4/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 1/5; 4/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.835, test=0.608) total time= 13.9s
[CV 2/5; 4/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 2/5; 4/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.834, test=0.605) total time= 14.2s
[CV 3/5; 4/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 3/5; 4/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.836, test=0.605) total time= 18.7s
[CV 4/5; 4/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 4/5; 4/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.833, test=0.607) total time= 22.2s
[CV 5/5; 4/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 5/5; 4/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.834, test=0.605) total time= 14.6s
[CV 1/5; 5/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 1/5; 5/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.836, test=0.603) total time= 18.6s
[CV 2/5; 5/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 2/5; 5/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.834, test=0.610) total time= 18.5s
[CV 3/5; 5/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 3/5; 5/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.835, test=0.604) total time= 18.8s
[CV 4/5; 5/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 4/5; 5/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.829, test=0.607) total time= 13.9s
[CV 5/5; 5/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 5/5; 5/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.834, test=0.608) total time= 14.2s
[CV 1/5; 6/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 1/5; 6/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.836, test=0.608) total time= 14.9s
[CV 2/5; 6/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 2/5; 6/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.834, test=0.607) total time= 19.2s
[CV 3/5; 6/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 3/5; 6/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.833, test=0.604) total time= 20.5s
[CV 4/5; 6/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 4/5; 6/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.833, test=0.607) total time= 18.7s
[CV 5/5; 6/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 5/5; 6/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.01, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.833, test=0.606) total time= 18.7s
[CV 1/5; 7/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 1/5; 7/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.834, test=0.605) total time= 17.6s
[CV 2/5; 7/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 2/5; 7/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.833, test=0.606) total time= 13.8s
[CV 3/5; 7/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 3/5; 7/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.837, test=0.606) total time= 19.0s
[CV 4/5; 7/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 4/5; 7/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.832, test=0.610) total time= 14.9s
[CV 5/5; 7/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5
[CV 5/5; 7/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0, lgbm__subsample=0.5;, score=(train=0.834, test=0.605) total time= 13.9s
[CV 1/5; 8/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 1/5; 8/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.833, test=0.604) total time= 14.1s
[CV 2/5; 8/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 2/5; 8/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.832, test=0.607) total time= 18.5s
[CV 3/5; 8/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 3/5; 8/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.832, test=0.604) total time= 13.7s
[CV 4/5; 8/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 4/5; 8/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.831, test=0.605) total time= 14.1s
[CV 5/5; 8/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 5/5; 8/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.830, test=0.604) total time= 18.6s
[CV 1/5; 9/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 1/5; 9/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.833, test=0.602) total time= 18.6s
[CV 2/5; 9/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 2/5; 9/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.833, test=0.609) total time= 14.2s
[CV 3/5; 9/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 3/5; 9/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.834, test=0.607) total time= 13.8s
[CV 4/5; 9/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 4/5; 9/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.833, test=0.610) total time= 19.0s
[CV 5/5; 9/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5
[CV 5/5; 9/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.2, lgbm__min_child_samples=100, lgbm__n_estimators=500, lgbm__num_leaves=31, lgbm__reg_alpha=0.02, lgbm__reg_lambda=0.2, lgbm__subsample=0.5;, score=(train=0.830, test=0.609) total time= 14.3s
BEST SCORE: 0.607504839535553
{'lgbm__colsample_bytree': 0.5, 'lgbm__learning_rate': 0.2, 'lgbm__min_child_samples': 100, 'lgbm__n_estimators': 500, 'lgbm__num_leaves': 31, 'lgbm__reg_alpha': 0.02, 'lgbm__reg_lambda': 0.2, 'lgbm__subsample': 0.5}
# Evaluate the tuned model on the held-out validation set using the default
# 0.5 decision threshold (evaluate_model, defined earlier in the notebook,
# prints the confusion matrix and the score summary).
print('Scores obtenus pour les données de validation')
print('-------------------------------')
evaluate_model(best_model, X_valid, y_valid,0.5)
Scores obtenus pour les données de validation ------------------------------- Confusion matrix: [[45673 10865] [ 2317 2648]] auc : 0.7435 f1_score :0.2866 f2_score :0.3967 accuracy :0.7857 precision :0.1960 recall : 0.5333 gain : 0.6029
# Probability of the positive class for every validation sample.
y_pred_proba = best_model.predict_proba(X_valid)[:, 1]
# Scan 100 evenly spaced decision thresholds and compute the business gain of
# the hard labels produced at each threshold. (The original also computed an
# unused y_pred = best_model.predict(X_valid); removed as dead code.)
threshold_array = np.linspace(0, 1, 100)
gain_scores = [compute_gain_score(y_valid, to_labels(y_pred_proba, t))
               for t in threshold_array]
# Retrieve the best threshold (gain maximisation).
maxgain_ix = argmax(gain_scores)
best_threshold = threshold_array[maxgain_ix]
max_gain = gain_scores[maxgain_ix]
print('Seuil=%.3f, gain maximum=%.5f' % (best_threshold, max_gain))
Seuil=0.616, gain maximum=0.62372
# Plot gain as a function of the decision threshold, with a vertical line
# marking the gain-maximising threshold found above.
fig = px.scatter(x=threshold_array, y=gain_scores)
fig.add_vline(x = best_threshold)
fig.update_layout(
title_text="Evolution du gain"
)
# Set x-axis title
fig.update_xaxes(title_text="Seuil de probabilité")
# Set y-axes titles
fig.update_yaxes(title_text="gain")
fig.show('notebook')
# Re-evaluate on the validation set using the gain-optimal threshold instead
# of the default 0.5, to quantify the improvement in business gain.
print('Scores obtenus pour les données de validation avec le threshold %.5f' %best_threshold)
print('-------------------------------')
evaluate_model(best_model,X_valid, y_valid, best_threshold)
Scores obtenus pour les données de validation avec le threshold 0.61616 ------------------------------- Confusion matrix: [[50279 6259] [ 3003 1962]] auc : 0.7435 f1_score :0.2976 f2_score :0.3493 accuracy :0.8494 precision :0.2387 recall : 0.3952 gain : 0.6237
Le modèle Light GBM permet de récupérer l'attribut feature_importances_
# Global feature importances read from the fitted LGBM step of the pipeline,
# indexed by feature name and sorted from most to least important.
feature_importance_df = pd.DataFrame()
feature_importance_df['importance'] = best_model['lgbm'].feature_importances_
feature_importance_df.index = features
feature_importance_df = feature_importance_df.sort_values(
by='importance', ascending=False)
# Names of the 20 most important features, reused below for the SHAP plot.
most_important_features = list(feature_importance_df.nlargest(20, columns=['importance']).index)
def show_global_importance(feature_importance_df, num_features):
    """Horizontal bar chart of the ``num_features`` most important features.

    With ``orientation='h'`` plotly puts the importance values on the x axis
    and the feature names on the y axis, so the axis titles are set
    accordingly (they were swapped in the original version).
    """
    df = feature_importance_df.nlargest(num_features, columns=['importance'])
    fig = px.bar(df, orientation='h')
    fig.update_xaxes(title='Importance')
    fig.update_yaxes(title='Feature')
    fig.update_traces(showlegend=False)
    fig.update_layout(
        title="Importance globale des features",
        font_size=11,
        height=800,
        width=600)
    fig.show('notebook')
show_global_importance(feature_importance_df, 20)
On peut aussi visualiser les influences locales respectives sur un sous-ensemble de données
# Balanced 1 % sample of the data, used for the local (per-individual)
# feature-influence visualisation below.
small_sample = get_sample_for_testing(data,0.01)
small_sample.shape
(3075, 102)
# Target / feature split for the explanation sample.
y_small_sample = small_sample[['TARGET']]
X_small_sample = small_sample.drop(columns=['SK_ID_CURR','TARGET'])
# BUG FIX: the original read `X_sample.columns`, an undefined name that would
# raise NameError; the intended source is the sample's feature frame.
features = X_small_sample.columns
Light GBM permet grâce à un paramètre (pred_contrib) de calculer les valeurs SHAP de chaque features, par individu.
# SHAP values via LightGBM's pred_contrib flag; the keyword is presumably
# forwarded by the sklearn Pipeline to the final LGBM step — TODO confirm
# the scaler step still transforms X first. The slice below keeps only the
# first len(features) columns, dropping the trailing column(s) that
# pred_contrib appends (LightGBM adds the expected/base value last).
shap_values= best_model.predict(X_small_sample.values,pred_contrib=True)
shap_df = pd.DataFrame(shap_values[:,0:len(features)], columns=features)
# Restrict to the 20 globally most important features for readability.
shap_best_df = shap_df[most_important_features]
def plot_local_importance_chart(shap_best_df):
    """Scatter plot of the per-individual SHAP values, one row per feature."""
    # Reshape wide -> long: one row per (feature, shap value) pair.
    long_df = pd.melt(shap_best_df, value_vars=shap_best_df.columns)
    long_df = long_df.rename(columns={"variable": "features",
                                      "value": "shap_value"})
    fig = px.scatter(long_df, y="features", x="shap_value", color='shap_value')
    fig.update_traces(marker_size=3)
    fig.update_layout(
        title="Influences locales des features pour chaque point",
        font_size=11,
        height=800,
        width=800)
    fig.show('notebook')
plot_local_importance_chart(shap_best_df)
! jupyter nbconvert training_and_optimisation.ipynb --to html
# Append `--ExtractOutputPreprocessor.enabled=False`